In [2]:
# http://nbviewer.ipython.org/github/jming/cs109/blob/master/lec_10_cross_val.ipynb

In [ ]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.cross_validation import cross_val_score, KFold
from sklearn.linear_model import LinearRegression

In [5]:
def hidden_model(x):
    # y is a linear combination of columns 5 and 10...
    result = x[:, 5] + x[:, 10]
    # ... with a little noise
    # loc is mean, scale is standard deviation
    result += np.random.normal(loc=0, scale=.005, size=result.shape)
    return result

def make_x(nobs):
    return np.random.uniform(low=0, high=3, size=(nobs, 10**6))

x = make_x(20)
y = hidden_model(x)

print x.shape


(20L, 1000000L)

In [7]:
selector = SelectKBest(f_regression, k=2).fit(x, y)
best_features = np.where(selector.get_support())[0]
print best_features


[595299 969164]

In [11]:
# get_support returns boolean array with KBest features as True, rest False
selector.get_support()[0:10]


Out[11]:
array([False, False, False, False, False, False, False, False, False, False], dtype=bool)

In [13]:
# when given only one array, np.where returns the indices of the non-zero (True) values
np.where(selector.get_support())


Out[13]:
(array([595299, 969164], dtype=int64),)

In [18]:
for b in best_features:
    plt.plot(x[:, b], y, 'o')
    plt.title('Column %i' % b)
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.show()



In [19]:
xt = x[:, best_features]
clf = LinearRegression().fit(xt, y)
print "Score(R^2) is :", clf.score(xt, y)


Score(R^2) is : 0.829452455754

In [20]:
y_pred = clf.predict(xt)
plt.plot(y_pred, y, 'o')
plt.plot(y, y, 'r-')
plt.xlabel('Predicted')
plt.ylabel('Observed')


Out[20]:
<matplotlib.text.Text at 0x13fb8cf8>

In [21]:
cross_val_score(clf, xt, y, cv=5).mean()


Out[21]:
0.53542337114497585

In [26]:
for train, test in KFold(n=len(y), n_folds=10):
    xtrain, xtest, ytrain, ytest = xt[train], xt[test], y[train], y[test]
    
    clf.fit(xtrain, ytrain)
    y_pred = clf.predict(xtest)
    
    plt.plot(y_pred, ytest, 'o')
    plt.plot(ytest, ytest, 'r-')
    
plt.xlabel('Predicted')
plt.ylabel('Observed')


Out[26]:
<matplotlib.text.Text at 0x139aa470>

In [27]:
x2 = make_x(100)
y2 = hidden_model(x2)
x2 = x2[:, best_features]

y2_pred = clf.predict(x2)

plt.plot(y2_pred, y2, 'o')
plt.plot(y2, y2, 'r-')
plt.xlabel('Predicted')
plt.ylabel('Observed')


Out[27]:
<matplotlib.text.Text at 0x137444e0>

Our model fails on new data! There is no real relationship between the selected columns and y: we picked the K best features out of a million mostly-noise columns using the entire dataset, so the selection itself fit the noise. And because feature selection had already seen every observation, cross-validation could not detect the overfitting.
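
To see how strong this selection bias is, here is a minimal sketch (not part of the original notebook) that repeats the selection step on data where y is pure noise; the column count is reduced to 10**5 to keep memory modest, and the exact R^2 will vary with the seed. It reuses the imports from the top of the notebook.

In [ ]:
# Selection bias in isolation: y is pure noise, yet choosing the "best"
# columns from the full dataset still yields a deceptively high in-sample R^2.
rng = np.random.RandomState(0)
x_noise = rng.uniform(low=0, high=3, size=(20, 10**5))  # fewer columns than above, for memory
y_noise = rng.normal(size=20)                            # no relationship to x_noise at all

sel = SelectKBest(f_regression, k=2).fit(x_noise, y_noise)
xt_noise = x_noise[:, sel.get_support()]
print(LinearRegression().fit(xt_noise, y_noise).score(xt_noise, y_noise))
# With 100,000 candidate columns and only 20 rows, the two selected columns
# fit the noise well, so this in-sample R^2 is typically far above zero.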

The right way to cross-validate.

To avoid this leak, we must not peek at the held-out data at any stage of fitting, including feature selection. That means selecting the k best features separately inside each cross-validation iteration, using only that iteration's training folds.


In [30]:
scores = []
best_features_list = []

for train, test in KFold(len(y), n_folds=5):
    xtrain, xtest, ytrain, ytest = x[train], x[test], y[train], y[test]
    
    b = SelectKBest(score_func=f_regression, k=2)
    b.fit(xtrain, ytrain)
    xtrain = xtrain[:, b.get_support()]
    xtest = xtest[:, b.get_support()]
    
    # Save the best features selected each time.
    best_features_list.append(np.where(b.get_support())[0])
    
    clf.fit(xtrain, ytrain)
    scores.append(clf.score(xtest, ytest))
    
    y_pred = clf.predict(xtest)
    plt.plot(y_pred, ytest, 'o')
    plt.plot(ytest, ytest, 'r-')

plt.xlabel('Predicted')
plt.ylabel('Observed')

print "CV Score (R^2) is:", np.mean(scores)


CV Score (R^2) is: -1.08858015915

In [34]:
import pprint

pprint.pprint(best_features_list)
pprint.pprint(scores)


[array([595299, 932157], dtype=int64),
 array([137514, 618756], dtype=int64),
 array([181178, 617072], dtype=int64),
 array([658208, 961409], dtype=int64),
 array([615455, 895830], dtype=int64)]
[-1.296585903714977,
 0.0035047490530437653,
 -1.883693928785247,
 -2.0743377549307271,
 -0.19178795737601195]

No column is selected in more than one of the 5 folds! This time we have properly detected the overfitting: the predictions for the held-out fold are poor in every iteration.

The abysmal average R^2 shows the same thing. We also never recovered the truly informative columns (5 and 10). That may be impossible with only 20 data points and a million candidate features, but at least we can now tell when our model does not generalize to new, out-of-sample data.
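
The fold-by-fold loop above can also be written as a scikit-learn Pipeline, which refits the feature selector on the training folds only inside each cross-validation split. The sketch below uses the modern sklearn.model_selection module (scikit-learn >= 0.18) rather than the sklearn.cross_validation module used earlier in this notebook.

In [ ]:
# Leak-free cross-validation with a Pipeline: SelectKBest is fit on the
# training portion of each fold, never on the held-out fold.
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, KFold

pipe = Pipeline([
    ('select', SelectKBest(score_func=f_regression, k=2)),
    ('regress', LinearRegression()),
])

scores = cross_val_score(pipe, x, y, cv=KFold(n_splits=5))
print(scores.mean())  # expected to be similarly poor, since selection never sees the test fold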

